# change notebook width and font
# fix: IPython.core.display is deprecated since IPython 7.14;
# the public location is IPython.display
from IPython.display import display, HTML
display(HTML("""<style>
/* source: http://stackoverflow.com/a/24207353 */
.container { width:95% !important; }
div.prompt, div.CodeMirror pre, div.output_area pre { font-family:'Hack', monospace; font-size: 10.5pt; }
</style>"""))
# set up plotly for inline (offline) plotting
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
# connected=False embeds plotly.js in the notebook so it works
# without an internet connection (adds ~5 MB to the notebook size)
connected = False
init_notebook_mode(connected=connected)
# load the expression matrix from a tab-separated file
# (rows appear to be genes, columns cells — see the per-gene
# lookups and per-column normalization below)
import pandas as pd
expression_file = '~/Dropbox/work/shared/itai_clustering/PBMC_4K/pbmc4k_expression.tsv'
matrix = pd.read_csv(expression_file, sep='\t', index_col=0, header=0)
print('The shape of the expression matrix is:', matrix.shape)
# normalize to the median and transform using the Freeman-Tukey transformation
import numpy as np
def ft_transform(matrix):
    """Apply the Freeman-Tukey transformation.

    Computes the variance-stabilizing transform sqrt(x) + sqrt(x + 1),
    element-wise.
    """
    root = np.sqrt(matrix)
    root_shifted = np.sqrt(matrix + 1)
    return root + root_shifted
def normalize(matrix):
    """Normalize each profile to the median transcript count.

    Rescales every column so that its total count equals the median
    total count across all columns.
    """
    totals = matrix.sum(axis=0)
    median_total = totals.median()
    return (median_total / totals) * matrix
# median-normalize each cell, then variance-stabilize the counts
trans_matrix = ft_transform(normalize(matrix))
# perform PCA, get a matrix with the scores for the first 20 PCs
from sklearn.decomposition import PCA
num_components = 20
seed = 0
# scikit-learn can perform a truncated PCA
# using the randomized PCA algorithm by Halko et al.,
# which is much faster than computing
# all principal components
pca_model = PCA(
    n_components=num_components,
    svd_solver='randomized',
    random_state=seed)  # fix: use the seed variable, not a hard-coded 0
# cells are columns here; sklearn expects (samples, features),
# so transpose in and transpose the scores back
pc_scores = pca_model.fit_transform(trans_matrix.T).T
print('The first %d PCs explain %.1f %% of the total variance.'
      % (num_components, 100*pca_model.explained_variance_ratio_.sum()))
print('The shape of the PC score matrix is:', pc_scores.shape)
from sklearn.manifold import TSNE
perplexity = 30
seed = 0
# scikit-learn uses the Barnes-Hut algorithm for t-SNE
tsne_model = TSNE(perplexity=perplexity, random_state=seed)
# sklearn expects (samples, features); cells are columns, so transpose
# in and transpose the 2-D embedding back
Y = tsne_model.fit_transform(pc_scores.T).T
# convert numpy array to pandas DataFrame (2 x cells)
dim_labels = ['tSNE dim. 1', 'tSNE dim. 2']
tsne_scores = pd.DataFrame(data=Y, index=dim_labels, columns=matrix.columns)
print('The shape of the t-SNE score matrix is:', tsne_scores.shape)
# scatter plot of the raw t-SNE embedding
trace = go.Scatter(
    x=tsne_scores.iloc[0],
    y=tsne_scores.iloc[1],
    mode='markers',
    marker={'size': 7, 'opacity': 0.7},
)
data = [trace]
layout = go.Layout(
    width=820,
    height=800,
    font={'size': 32, 'family': 'serif'},
    xaxis={'title': 'tSNE dim. 1', 'zeroline': False, 'showline': True,
           'tickvals': [], 'showgrid': False},
    yaxis={'title': 'tSNE dim. 2', 'zeroline': False, 'showline': True,
           'tickvals': [], 'showgrid': False},
    margin={'l': 60, 'b': 60},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
from sklearn.cluster import DBSCAN
def cluster_cells_dbscan(
        tsne_scores,
        min_cells_frac=0.01,
        eps_frac=0.05,
        seed=0):
    """Cluster cells using DBSCAN on their t-SNE coordinates.

    Parameters
    ----------
    tsne_scores : pandas.DataFrame
        2 x cells matrix of t-SNE coordinates (cells are columns).
    min_cells_frac : float
        DBSCAN min_samples, expressed as a fraction of the cell count.
    eps_frac : float
        DBSCAN neighborhood radius, as a fraction of the diagonal of
        the embedding's bounding box.
    seed : int
        Unused; kept for backward compatibility (DBSCAN is
        deterministic for a fixed input).

    Returns
    -------
    cell_labels : pandas.Series
        'Cluster 1', ..., or 'Outliers' for each cell.
    cluster_labels : list of str
        Cluster names, largest cluster first.
    """
    # convert the fractional parameters to absolute values
    ptp = np.ptp(tsne_scores.values, axis=1)
    eps = eps_frac * pow(np.sum(np.power(ptp, 2.0)), 0.5)
    # fix: DBSCAN requires an integer min_samples (recent scikit-learn
    # rejects floats); truncate, matching the "%d" print below
    min_cells = int(min_cells_frac * tsne_scores.shape[1])
    print('Min cells: %d' % min_cells)
    print('Neighborhood: %.2f' % eps)
    # perform DBSCAN
    model = DBSCAN(algorithm='brute', min_samples=min_cells, eps=eps)
    y = model.fit_predict(tsne_scores.T)
    # fix: index by the columns of tsne_scores instead of the module
    # global `matrix` (same cells, but no hidden global dependency)
    cell_labels = pd.Series(index=tsne_scores.columns, data=y)
    # rename numeric DBSCAN labels; value_counts() sorts by size, so
    # 'Cluster 1' is the largest cluster; noise points (-1) become
    # 'Outliers'
    vc = cell_labels.value_counts()
    cluster_labels = {
        id_: 'Cluster %d' % (i+1) for i, id_ in enumerate(vc.index)}
    cluster_labels[-1] = 'Outliers'
    cell_labels = cell_labels.map(cluster_labels)
    cluster_labels = list(cluster_labels.values())
    return cell_labels, cluster_labels
# DBSCAN parameters as fractions of the cell count / embedding diagonal
# (presumably tuned by eye for this dataset — see cluster_cells_dbscan)
min_cells_frac = 0.008
eps_frac = 0.03
cell_labels, cluster_labels = cluster_cells_dbscan(
    tsne_scores, min_cells_frac, eps_frac)
# print cluster sizes
print(cell_labels.value_counts())
# plot the embedding again, one trace per cluster so each gets its
# own color and legend entry
data = []
for cl in cluster_labels:
    member_scores = tsne_scores.loc[:, cell_labels == cl]
    data.append(go.Scatter(
        x=member_scores.iloc[0],
        y=member_scores.iloc[1],
        mode='markers',
        marker={'size': 7, 'opacity': 0.7},
        name=cl,
    ))
layout = go.Layout(
    width=1000,
    height=800,
    font={'size': 32, 'family': 'serif'},
    xaxis={'title': 'tSNE dim. 1', 'zeroline': False, 'showline': True,
           'tickvals': [], 'showgrid': False},
    yaxis={'title': 'tSNE dim. 2', 'zeroline': False, 'showline': True,
           'tickvals': [], 'showgrid': False},
    margin={'l': 60, 'b': 60},
    legend={'borderwidth': 1, 'bordercolor': 'black'},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# genes whose expression will be visualized on the embedding below
sel_genes = ['CD3D', 'FTL', 'CD79B']
def normalize(matrix):
    """Normalize each profile to the median transcript count.

    NOTE(review): this is an exact duplicate of the `normalize` defined
    earlier in this file (likely a copied notebook cell) — consider
    removing one of the two definitions.
    """
    num_transcripts = matrix.sum(axis=0)
    transcript_count = num_transcripts.median()
    norm_matrix = (transcript_count / num_transcripts) * matrix
    return norm_matrix
# one expression plot per selected gene, colored by normalized count
norm_matrix = normalize(matrix)
for gene in sel_genes:
    profile = norm_matrix.loc[gene]
    # clip the color scale to mean +/- 2 SD so a few extreme cells
    # don't wash out the dynamic range
    mean = profile.mean()
    std = profile.std(ddof=1)
    cmin = max(mean - 2*std, 0)
    cmax = mean + 2*std
    trace = go.Scatter(
        x=tsne_scores.iloc[0],
        y=tsne_scores.iloc[1],
        mode='markers',
        marker={
            'size': 7,
            'opacity': 0.7,
            'color': profile,
            'cmin': cmin,
            'cmax': cmax,
            'showscale': True,
            'colorbar': {
                'len': 0.6,
                'title': 'Normal. transcripts',
                'titleside': 'right'},
        },
    )
    data = [trace]
    layout = go.Layout(
        width=820,
        height=800,
        font={'size': 32, 'family': 'serif'},
        xaxis={'title': 'tSNE dim. 1', 'zeroline': False, 'showline': True,
               'tickvals': [], 'showgrid': False},
        yaxis={'title': 'tSNE dim. 2', 'zeroline': False, 'showline': True,
               'tickvals': [], 'showgrid': False},
        margin={'l': 60, 'b': 60},
        title='Gene: <i>%s</i>' % gene,
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)